Goal: predict the average total number of trips per hour across the week for unseen stations (stations not used in training).
# Load the ward-level London census data
library(data.table)
census <- fread("data/London_census.csv", header = TRUE, sep = ",", dec = ".")
head(census)
## WardCode WardName borough NESW AreaSqKm lon
## 1: E05000026 Abbey Barking and Dagenham East 1.3 0.077935
## 2: E05000027 Alibon Barking and Dagenham East 1.4 0.148270
## 3: E05000028 Becontree Barking and Dagenham East 1.3 0.118957
## 4: E05000029 Chadwell Heath Barking and Dagenham East 3.4 0.139985
## 5: E05000030 Eastbrook Barking and Dagenham East 3.5 0.173581
## 6: E05000031 Eastbury Barking and Dagenham East 1.4 0.105683
## lat IncomeScor LivingEnSc NoEmployee GrenSpace PopDen BornUK NotBornUK
## 1: 51.53971 0.27 42.76 7900 19.6 9884.6 5459 7327
## 2: 51.54559 0.28 27.96 800 22.4 7464.3 7824 2561
## 3: 51.55453 0.25 31.59 1100 3.0 8923.1 8075 3470
## 4: 51.58475 0.27 34.78 1700 56.4 2970.6 7539 2482
## 5: 51.55365 0.19 21.25 4000 51.1 3014.3 8514 1992
## 6: 51.53590 0.27 31.16 1000 18.1 8357.1 7880 3744
## NoCTFtoH NoDwelling NoFlats NoHouses NoOwndDwel MedHPrice
## 1: 0.1 4733 3153 1600 1545 177000
## 2: 0.1 4045 574 3471 1849 160000
## 3: 0.1 4378 837 3541 2093 170000
## 4: 0.4 4050 1400 2662 2148 195000
## 5: 0.5 3976 742 3235 2646 191750
## 6: 0.0 4321 933 3388 1913 167250
# Dimensions and candidate-key check: which column uniquely identifies a row?
dim(census)
## [1] 625 20
# All 625 rows are distinct
uniqueN(census)
## [1] 625
# WardName is NOT unique (604 < 625), so it cannot be a key
uniqueN(census,by="WardName")
## [1] 604
uniqueN(census,by="borough")
## [1] 33
uniqueN(census,by="NESW")
## [1] 5
# WardCode is unique per row -- use it as the join key
uniqueN(census,by="WardCode")
## [1] 625
The census data has 625 rows and 20 variables. From the uniqueness analysis, WardCode is the only fully unique identifier (625 distinct values), so we use it as the primary key when joining tables.
library(gridExtra)
library(ggplot2)
# Distributions of employee counts and median house prices,
# coloured by compass region (NESW)
hist_emp <- ggplot(census, aes(NoEmployee)) +
  geom_histogram(aes(fill = NESW), bins = 20)
hist_price <- ggplot(census, aes(MedHPrice)) +
  geom_histogram(aes(fill = NESW), bins = 20)
grid.arrange(hist_emp, hist_price, nrow = 2)
The Number of Employees and Median House Price variables have outliers, which are mainly concentrated in central London.
# Create an absolute population metric from density x area
census$Pop = census$PopDen * census$AreaSqKm
# The pairs below are highly correlated, i.e. they carry the same information
cor(census$NoDwelling,(census$NoFlats+census$NoHouses))
## [1] 0.9985431
cor(census$NoCTFtoH,census$MedHPrice)
## [1] 0.8169244
cor(census$Pop,(census$BornUK+census$NotBornUK))
## [1] 0.999896
# Dropping the redundant columns identified above
census[,c("PopDen","NoCTFtoH","NoDwelling","Pop")]=NULL
# Log-transform the right-skewed variables NoEmployee and MedHPrice
census$NoEmployee=log(census$NoEmployee)
census$MedHPrice=log(census$MedHPrice)
# Re-plot after the log transform to confirm the skew is reduced
p1 = ggplot(census,aes(NoEmployee))+geom_histogram(aes(fill=NESW),bins = 20)
p2 = ggplot(census,aes(MedHPrice))+geom_histogram(aes(fill=NESW),bins = 20)
grid.arrange(p1, p2, nrow=2)
# Load the bike-station metadata (location and capacity)
stations <- fread("data/bike_stations.csv", header = TRUE, sep = ",", dec = ".")
head(stations)
## Station_ID Capacity Latitude Longitude Station_Name
## 1: 1 19 51.52916 -0.109970 River Street , Clerkenwell
## 2: 2 37 51.49961 -0.197574 Phillimore Gardens, Kensington
## 3: 3 32 51.52128 -0.084605 Christopher Street, Liverpool Street
## 4: 4 23 51.53006 -0.120973 St. Chad's Street, King's Cross
## 5: 5 27 51.49313 -0.156876 Sedding Street, Sloane Square
## 6: 6 18 51.51812 -0.144228 Broadcasting House, Marylebone
dim(stations)
## [1] 773 5
The stations data have 773 stations and 5 variables describing the location and capacity of each station.
# Overlay ward centres (red) on station coordinates (blue)
ggplot(census, aes(x = lon, y = lat)) +
  geom_point(col = "red") +
  geom_point(data = stations, aes(x = Longitude, y = Latitude),
             col = "blue", alpha = 0.25) +
  labs(x = "Longitude", y = "Latitude", title = "London Ward centers in Red and Stations Coords in blue")
# Zoom to the bounding box of the stations
ggplot(census, aes(x = lon, y = lat)) +
  geom_point(col = "red") +
  geom_point(data = stations, aes(x = Longitude, y = Latitude),
             col = "blue", alpha = 0.25) +
  xlim(range(stations$Longitude)) +
  ylim(range(stations$Latitude)) +
  labs(x = "Longitude", y = "Latitude", title = "London Ward centers in Red and Stations Coords in blue")
## Warning: Removed 478 rows containing missing values (geom_point).
Using k-means with a single iteration to find the nearest centroids -- the ward-centre coordinates -- which in turn are assigned to the stations; this gives the census and stations tables a common key to merge on.
# Stack ward centres and station coordinates into one table so a single
# k-means assignment step can label each station with its nearest ward.
combined = rbind(census[,c("lon","lat")],stations[,c("Longitude","Latitude")],use.names=FALSE)
# Generalized: derive the ward count instead of hard-coding 625/626,
# so the code survives a change in the census file.
n_wards = nrow(census)
# Ward centres serve as fixed initial centroids; one iteration performs
# the nearest-centroid assignment (convergence is not wanted here).
clusters = kmeans(combined, combined[1:n_wards,], iter.max = 1)
## Warning: did not converge in 1 iteration
# Station rows follow the ward rows in `combined`; map each station to the
# WardCode of its nearest centre.
# NOTE(review): this overwrites the human-readable Station_Name column.
stations$Station_Name = census$WardCode[clusters$cluster[(n_wards + 1):nrow(combined)]]
# Indices of the ward centres matched by at least one station.
# NOTE(review): `c` shadows base::c; kept because later code indexes census[c].
c = unique(clusters$cluster[(n_wards + 1):nrow(combined)])
library(rgdal)
## Loading required package: sp
## rgdal: version: 1.4-8, (SVN revision 845)
## Geospatial Data Abstraction Library extensions to R successfully loaded
## Loaded GDAL runtime: GDAL 2.4.2, released 2019/06/28
## Path to GDAL shared files: /Library/Frameworks/R.framework/Versions/3.6/Resources/library/rgdal/gdal
## GDAL binary built with GEOS: FALSE
## Loaded PROJ.4 runtime: Rel. 5.2.0, September 15th, 2018, [PJ_VERSION: 520]
## Path to PROJ.4 shared files: /Library/Frameworks/R.framework/Versions/3.6/Resources/library/rgdal/proj
## Linking to sp version: 1.3-2
library(sp)
library(rgeos)
## rgeos version: 0.5-2, (SVN revision 621)
## GEOS runtime version: 3.7.2-CAPI-1.11.2
## Linking to sp version: 1.3-1
## Polygon checking: TRUE
# London_Ward shapefile (London government open data) for ward boundaries
m = readOGR(dsn=path.expand("./"), "London_Ward")
## OGR data source with driver: ESRI Shapefile
## Source: "/Users/islamhasabo/GitHub/Machine-Learning/R/London-Bikes", layer: "London_Ward"
## with 657 features
## It has 6 fields
# Reproject to WGS84 lon/lat so the polygons match our point coordinates
m.wgs84 = spTransform(m, CRS("+proj=longlat +datum=WGS84"))
# Plot boundaries with ward centres (red), stations (blue) and the
# ward centres selected by the k-means assignment (orange)
ggplot(m.wgs84)+geom_polygon(aes(x = long, y = lat, group = group), fill = "white", colour = "gray")+
xlim(range(stations$Longitude))+ylim(range(stations$Latitude))+
geom_point(data = census,aes(x=lon,y=lat),col="red")+
geom_point(data=stations,aes(x=Longitude,y=Latitude),col="blue",alpha=0.25)+
geom_point(data=census[c],aes(x=lon,y=lat),col="orange") +
labs(x = "Longitude", y = "Latitude", title = "Part of London Map with Ward boundaires",
subtitle = "Red dots for Ward Centers, Blue for Station Coords and Orange for the selected Ward Centers")
## Regions defined for each Polygons
## Warning: Removed 478 rows containing missing values (geom_point).
## Warning: Removed 4 rows containing missing values (geom_point).
# Join census and stations: stations carry the WardCode of their nearest
# ward centre in Station_Name, which matches census$WardCode.
merge1 <- merge(census, stations, by.x = "WardCode", by.y = "Station_Name")
# Haversine distance between each station and its ward centre
# (r = 6378 gives kilometres)
library(geosphere)
merge1$di <- distHaversine(merge1[, c("lon", "lat")],
                           merge1[, c("Longitude", "Latitude")], r = 6378)
# Keep only the single closest station per ward: avoids duplicated census
# rows and reduces stations assigned to the wrong ward.
merge1 <- merge1[order(di)]
merge11 <- merge1[!duplicated(WardCode)]
head(merge11[, c("WardCode", "Station_ID")])
## WardCode Station_ID
## 1: E05000623 609
## 2: E05000426 654
## 3: E05000392 442
## 4: E05000429 794
## 5: E05000391 559
## 6: E05000583 576
113 out of 773 have been selected. They are the ones with least distance to Ward Centers.
# Visualizing the final selected stations (blue) against the selected ward
# centres (orange) on the ward-boundary map.
ggplot(m.wgs84)+geom_polygon(aes(x = long, y = lat, group = group), fill = "white", colour = "gray")+
xlim(range(stations$Longitude))+ylim(range(stations$Latitude))+
geom_point(data = merge11,aes(x=Longitude,y=Latitude),col="blue",alpha=0.25)+
geom_point(data=census[c],aes(x=lon,y=lat),col="orange") +
# Fix: the previous subtitle claimed red dots, but no red layer is plotted
# here; title typo "boundaires" also corrected.
labs(x = "Longitude", y = "Latitude", title = "Part of London Map with Ward boundaries",
subtitle = "Blue dots for the selected Stations and Orange for the selected Ward Centers")
## Regions defined for each Polygons
## Warning: Removed 4 rows containing missing values (geom_point).
# Load the individual journey records (csv)
journeys <- fread("data/bike_journeys.csv", header = TRUE, sep = ",", dec = ".")
head(journeys)
## Journey_Duration Journey_ID End_Date End_Month End_Year End_Hour End_Minute
## 1: 2040 953 19 9 17 18 0
## 2: 1800 12581 19 9 17 15 21
## 3: 1140 1159 15 9 17 17 1
## 4: 420 2375 14 9 17 12 16
## 5: 1200 14659 13 9 17 19 33
## 6: 1320 2351 14 9 17 14 53
## End_Station_ID Start_Date Start_Month Start_Year Start_Hour Start_Minute
## 1: 478 19 9 17 17 26
## 2: 122 19 9 17 14 51
## 3: 639 15 9 17 16 42
## 4: 755 14 9 17 12 9
## 5: 605 13 9 17 19 13
## 6: 514 14 9 17 14 31
## Start_Station_ID
## 1: 251
## 2: 550
## 3: 212
## 4: 163
## 5: 36
## 6: 589
# 1,542,844 journeys x 14 columns
dim(journeys)
## [1] 1542844 14
1,542,844 observations with 14 features describing each trip's start, end, and duration.
# Use the journey START attributes as the demand signal:
# count trips per station per calendar hour.
j1 <- journeys[, .N, by = .(Start_Station_ID, Start_Year, Start_Month,
                            Start_Date, Start_Hour)]
# Build proper timestamps; two-digit years are offsets from 2000.
# NOTE(review): tz = "" uses the machine's local timezone -- confirm this
# matches the timezone of the raw data.
d <- with(j1, ISOdatetime(Start_Year + 2000, Start_Month, Start_Date,
                          Start_Hour, min = 0, sec = 0, tz = ""))
# Extract calendar features: ISO week, day of year, weekday, hour
p = as.POSIXlt(d)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
## The following object is masked from 'package:base':
##
## date
# Duplicate the station id under a shorter name for later grouping
j1$Station=j1$Start_Station_ID
j1$Week=isoweek(p)
# POSIXlt yday is 0-based (Jan 1 = 0)
j1$Day = p$yday
# Labelled weekday with Monday as the first day of the week
j1$WeekDay=wday(p,label = TRUE,week_start = getOption("lubridate.week.start", 1))
j1$Hour=hour(p)
library(chron)
##
## Attaching package: 'chron'
## The following objects are masked from 'package:lubridate':
##
## days, hours, minutes, seconds, years
# 0/1 weekend indicator (is.weekend already returns logical, so the
# as.logical wrapper is redundant but harmless)
j1$WeekEnd = as.integer(as.logical(is.weekend(p)))
summary(j1)
## Start_Station_ID Start_Year Start_Month Start_Date Start_Hour
## Min. : 1.0 Min. :17 Min. :8.000 Min. : 1.00 Min. : 0.00
## 1st Qu.:187.0 1st Qu.:17 1st Qu.:8.000 1st Qu.: 7.00 1st Qu.: 9.00
## Median :381.0 Median :17 Median :8.000 Median :13.00 Median :14.00
## Mean :395.1 Mean :17 Mean :8.376 Mean :13.74 Mean :13.45
## 3rd Qu.:604.0 3rd Qu.:17 3rd Qu.:9.000 3rd Qu.:19.00 3rd Qu.:18.00
## Max. :826.0 Max. :17 Max. :9.000 Max. :31.00 Max. :23.00
##
## N Station Week Day WeekDay
## Min. : 1.000 Min. : 1.0 Min. :31.00 Min. :212.0 Mon:63809
## 1st Qu.: 1.000 1st Qu.:187.0 1st Qu.:32.00 1st Qu.:224.0 Tue:78012
## Median : 2.000 Median :381.0 Median :34.00 Median :236.0 Wed:61465
## Mean : 3.318 Mean :395.1 Mean :34.21 Mean :236.4 Thu:69243
## 3rd Qu.: 4.000 3rd Qu.:604.0 3rd Qu.:36.00 3rd Qu.:249.0 Fri:68325
## Max. :182.000 Max. :826.0 Max. :38.00 Max. :261.0 Sat:64433
## Sun:59681
## Hour WeekEnd
## Min. : 0.00 Min. :0.0000
## 1st Qu.: 9.00 1st Qu.:0.0000
## Median :14.00 Median :0.0000
## Mean :13.45 Mean :0.2669
## 3rd Qu.:18.00 3rd Qu.:1.0000
## Max. :23.00 Max. :1.0000
##
The aggregated trips per hour have a right-skewed distribution. The journeys span 50 consecutive days across 8 weeks between 826 stations.
# Pivot to wide format -- one row per hour, one column per station --
# as required by the time-series functions below.
j2 <- dcast(j1, Start_Year + Start_Month + Start_Date + Start_Hour ~ Start_Station_ID,
            value.var = "N")
# Hours with no recorded trips come out as NA; treat them as zero demand.
j2[is.na(j2)] <- 0
library(forecast)
## Registered S3 method overwritten by 'xts':
## method from
## as.zoo.xts zoo
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## Registered S3 methods overwritten by 'forecast':
## method from
## fitted.fracdiff fracdiff
## residuals.fracdiff fracdiff
library(seasonal)
library(rugarch)
## Loading required package: parallel
##
## Attaching package: 'rugarch'
## The following object is masked from 'package:stats':
##
## sigma
# Multi-seasonal hourly series for one arbitrary station (ID 129):
# daily season = 24 slots, weekly season = 168 slots.
t <- msts(j2[["129"]], seasonal.periods = c(24, 168), ts.frequency = 24,
          start = c(212, 0))
# Decompose into trend and the two seasonal components
autoplot(mstl(t, s.window = "periodic", robust = TRUE), 1) + xlab("Day")
# Forecast via seasonal decomposition (STL)
autoplot(stlf(t)) + xlab("Day")
The seasonality over a one-day interval is much stronger than over a one-week interval, and the trend is almost flat. Forecasting the number of trips per hour per station is feasible, but the model varies from station to station, which does not meet our main objective.
# Sum and mean of daily trips per station, aggregated by weekday,
# over the full 50-day period
j3 <- j1[, .(sum(N), mean(N)), by = .(Station, Week, WeekDay, Day)]
sum_plot <- ggplot(j3, aes(x = WeekDay, y = V1)) +
  geom_col(fill = "purple") + ylab("Sum")
mean_plot <- ggplot(j3, aes(x = WeekDay, y = V2)) +
  geom_col(fill = "violet") + ylab("Mean")
grid.arrange(sum_plot, mean_plot, ncol = 2)
Sum and Mean show similar distributions across the weekdays.
# Mean total trips per weekday, shown separately for each week in the period
ggplot(j3, aes(x = WeekDay, y = V2)) +
  geom_col(aes(fill = Week)) +
  facet_wrap(vars(Week))
Some days like Tue in Week 31 and Wed in week 32 have significant variations from the rest of the days.
# Mean trips per station for every weekday x hour combination
j4 <- j1[, .(V1 = mean(N)), by = .(Station, WeekDay, WeekEnd, Hour)]
ggplot(j4, aes(x = WeekDay, y = V1)) +
  geom_col(aes(fill = Hour)) +
  facet_wrap(vars(Hour))
WeekDay*Hour seems to have a high predictive power. Weekends have more number of trips in the off peak hours.
# Attach each station's ward-level census features to its hourly means,
# joining the aggregated journeys (j4) to the station/ward table (merge11)
merge2 <- merge(j4, merge11, by.x = "Station", by.y = "Station_ID")
head(merge2)
## Station WeekDay WeekEnd Hour V1 WardCode WardName borough NESW
## 1: 12 Mon 0 12 3.857143 E05000129 Bloomsbury Camden Central
## 2: 12 Mon 0 10 1.666667 E05000129 Bloomsbury Camden Central
## 3: 12 Sun 1 16 4.285714 E05000129 Bloomsbury Camden Central
## 4: 12 Wed 0 18 12.571429 E05000129 Bloomsbury Camden Central
## 5: 12 Mon 0 18 14.571429 E05000129 Bloomsbury Camden Central
## 6: 12 Thu 0 18 14.714286 E05000129 Bloomsbury Camden Central
## AreaSqKm lon lat IncomeScor LivingEnSc NoEmployee GrenSpace
## 1: 1 -0.131436 51.52199 0.11 54.65 11.14764 9.2
## 2: 1 -0.131436 51.52199 0.11 54.65 11.14764 9.2
## 3: 1 -0.131436 51.52199 0.11 54.65 11.14764 9.2
## 4: 1 -0.131436 51.52199 0.11 54.65 11.14764 9.2
## 5: 1 -0.131436 51.52199 0.11 54.65 11.14764 9.2
## 6: 1 -0.131436 51.52199 0.11 54.65 11.14764 9.2
## BornUK NotBornUK NoFlats NoHouses NoOwndDwel MedHPrice Capacity Latitude
## 1: 5317 5575 5226 197 1293 12.80765 49 51.52168
## 2: 5317 5575 5226 197 1293 12.80765 49 51.52168
## 3: 5317 5575 5226 197 1293 12.80765 49 51.52168
## 4: 5317 5575 5226 197 1293 12.80765 49 51.52168
## 5: 5317 5575 5226 197 1293 12.80765 49 51.52168
## 6: 5317 5575 5226 197 1293 12.80765 49 51.52168
## Longitude di
## 1: -0.130431 0.07754604
## 2: -0.130431 0.07754604
## 3: -0.130431 0.07754604
## 4: -0.130431 0.07754604
## 5: -0.130431 0.07754604
## 6: -0.130431 0.07754604
# 15,827 station-weekday-hour rows; all 113 selected stations have trips
dim(merge2)
## [1] 15827 26
length(unique(merge2$Station))
## [1] 113
After merging the 113 stations which have unique census data, all of them have recorded trips.
# Convert WeekDay, Hour and WeekEnd into unordered categorical attributes
merge2$WeekDay=factor(merge2$WeekDay,ordered = FALSE)
merge2$Hour=as.factor(merge2$Hour)
merge2$WeekEnd=as.factor(merge2$WeekEnd)
# Drop identifier / redundant columns (NESW is dropped here, not converted)
merge2[,c("WardCode","WardName","NESW","borough","lon","lat")]=NULL
# Keep only complete rows
merge2=merge2[complete.cases(merge2)]
# Standardize the numerical independent features.
# NOTE(review): scale() is fitted on ALL stations before the train/test
# split below, so test-station statistics leak into the scaling -- confirm
# this is acceptable for the evaluation.
bike=scale(merge2[,c("AreaSqKm","IncomeScor","LivingEnSc","NoEmployee","GrenSpace","BornUK","NotBornUK","NoFlats","NoHouses","NoOwndDwel","MedHPrice","Capacity","Longitude","Latitude")])
#spliting the stations randomly betweem the training and test with prcentage 80 to 20
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:lubridate':
##
## intersect, setdiff, union
## The following objects are masked from 'package:rgeos':
##
## intersect, setdiff, union
## The following object is masked from 'package:gridExtra':
##
## combine
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
set.seed(20)
# Scaled numeric features plus the target (V1), categorical terms and the
# Station id. Built ONCE and reused for both splits -- the original code
# constructed this identical cbind twice.
bikeAll = cbind(bike,merge2[,c("V1","WeekDay","WeekEnd","Hour","Station")])
# Hold out whole stations: 80% of station IDs for training, the remaining
# 20% for testing, so test stations are completely unseen during training.
# floor() makes the previously implicit truncation of 0.8 * 113 explicit.
trainStations = sample(unique(bikeAll$Station),
                       floor(0.8 * length(unique(bikeAll$Station))))
bike1Train = subset(bikeAll, bikeAll$Station %in% trainStations)
bike1Test = subset(bikeAll, !bikeAll$Station %in% trainStations)
length(unique(bike1Train$Station))
## [1] 90
length(unique(bike1Test$Station))
## [1] 23
# Drop the Station ID, which is a label rather than a predictor
bike1Train$Station = NULL
bike1Test$Station=NULL
90 randomly chosen stations will be used for training the models and a disjoint set of 23 for testing.
# Table collecting each model's predictions for later comparison
perf <- data.table()
# Linear model: hourly mean trips explained by the time slots plus the
# census features found to be predictive
linearModel <- lm(
  V1 ~ WeekDay + Hour + AreaSqKm + IncomeScor + NoEmployee + GrenSpace +
    BornUK + NotBornUK + NoFlats + NoOwndDwel + MedHPrice + Capacity +
    Longitude + Latitude,
  data = bike1Train
)
summary(linearModel)
##
## Call:
## lm(formula = V1 ~ (WeekDay + Hour + AreaSqKm + IncomeScor + NoEmployee +
## GrenSpace + BornUK + NotBornUK + NoFlats + NoOwndDwel + MedHPrice +
## Capacity + Longitude + Latitude), data = bike1Train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.8996 -0.8894 -0.2642 0.4883 28.7164
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.49930 0.09279 16.159 < 2e-16 ***
## WeekDayTue 0.08021 0.05698 1.408 0.159219
## WeekDayWed -0.10623 0.05750 -1.847 0.064708 .
## WeekDayThu 0.01764 0.05713 0.309 0.757492
## WeekDayFri -0.02070 0.05680 -0.364 0.715594
## WeekDaySat 0.07249 0.05676 1.277 0.201531
## WeekDaySun 0.04247 0.05710 0.744 0.457070
## Hour1 -0.08556 0.12714 -0.673 0.500988
## Hour2 -0.18876 0.13542 -1.394 0.163391
## Hour3 -0.15973 0.14763 -1.082 0.279277
## Hour4 -0.32693 0.15366 -2.128 0.033390 *
## Hour5 -0.26748 0.12745 -2.099 0.035860 *
## Hour6 0.20998 0.11216 1.872 0.061206 .
## Hour7 1.28941 0.10989 11.734 < 2e-16 ***
## Hour8 2.80514 0.10895 25.748 < 2e-16 ***
## Hour9 1.16098 0.10870 10.681 < 2e-16 ***
## Hour10 0.71147 0.10882 6.538 6.49e-11 ***
## Hour11 0.87788 0.10885 8.065 8.01e-16 ***
## Hour12 1.09292 0.10865 10.059 < 2e-16 ***
## Hour13 1.13038 0.10879 10.390 < 2e-16 ***
## Hour14 1.06435 0.10886 9.778 < 2e-16 ***
## Hour15 1.13072 0.10872 10.400 < 2e-16 ***
## Hour16 1.37593 0.10872 12.656 < 2e-16 ***
## Hour17 2.15139 0.10856 19.818 < 2e-16 ***
## Hour18 2.03058 0.10859 18.700 < 2e-16 ***
## Hour19 1.16570 0.10869 10.725 < 2e-16 ***
## Hour20 0.54996 0.10889 5.051 4.47e-07 ***
## Hour21 0.28638 0.11018 2.599 0.009352 **
## Hour22 0.18774 0.11057 1.698 0.089558 .
## Hour23 0.10552 0.11297 0.934 0.350288
## AreaSqKm 0.11925 0.02452 4.863 1.17e-06 ***
## IncomeScor -0.34732 0.04019 -8.642 < 2e-16 ***
## NoEmployee 0.13403 0.02476 5.413 6.33e-08 ***
## GrenSpace -0.11647 0.02143 -5.434 5.60e-08 ***
## BornUK -0.04829 0.03714 -1.300 0.193612
## NotBornUK -0.33145 0.02589 -12.804 < 2e-16 ***
## NoFlats 0.13098 0.02773 4.724 2.34e-06 ***
## NoOwndDwel -0.10959 0.03365 -3.257 0.001130 **
## MedHPrice -0.12104 0.03102 -3.901 9.61e-05 ***
## Capacity 0.21120 0.01614 13.087 < 2e-16 ***
## Longitude 0.08838 0.02551 3.464 0.000534 ***
## Latitude 0.31288 0.02230 14.029 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.7 on 12492 degrees of freedom
## Multiple R-squared: 0.2385, Adjusted R-squared: 0.236
## F-statistic: 95.45 on 41 and 12492 DF, p-value: < 2.2e-16
# Store test-set predictions and ground truth for comparison
perf$lm_pred = predict(linearModel,bike1Test)
# Fix: extract V1 as a plain vector; bike1Test[,"V1"] returns a one-column
# data.table and only worked here by accident of column unpacking.
perf$lm_actual = bike1Test$V1
library(Metrics)
##
## Attaching package: 'Metrics'
## The following object is masked from 'package:forecast':
##
## accuracy
library(MLmetrics)
##
## Attaching package: 'MLmetrics'
## The following object is masked from 'package:base':
##
## Recall
# Negative R-squared: the linear model predicts test stations worse than
# simply using the mean of the actuals
R2_Score(perf$lm_actual, perf$lm_pred)
## [1] -1.189744
RMSE(perf$lm_actual, perf$lm_pred)
## [1] 1.31405
head(perf$lm_pred)
## [1] 2.847810 2.865452 4.436044 4.342854 2.478746 1.738556
head(perf$lm_actual)
## [1] 2.000000 1.666667 1.000000 1.428571 1.666667 1.500000
The census data has low power in explaining the variation in the number of trips.
Preparing the input as matrices instead of data frames: x, y for training and x1, y1 for testing.
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following objects are masked from 'package:MLmetrics':
##
## MAE, RMSE
## The following objects are masked from 'package:Metrics':
##
## precision, recall
# One-hot encode the categorical attributes (WeekDay, WeekEnd, Hour)
dmy <- dummyVars(~., data = bike1Train)
trsf <- data.frame(predict(dmy, newdata = bike1Train))
# Hand-crafted interaction features (logic-gate style products/sums of the
# dummies) capturing how peak hours interact with weekends:
# weekend morning-commute slots (off-peak on weekends)
trsf$HNP <- (trsf$WeekDay.Sat + trsf$WeekDay.Sun) * (trsf$Hour.7 + trsf$Hour.8)
# weekend midday slots (weekend leisure peak)
trsf$HP <- (trsf$WeekDay.Sat + trsf$WeekDay.Sun) *
  (trsf$Hour.11 + trsf$Hour.12 + trsf$Hour.13 + trsf$Hour.14)
# overnight off-peak hours
trsf$NP <- trsf$Hour.0 + trsf$Hour.1 + trsf$Hour.2 + trsf$Hour.3 +
  trsf$Hour.4 + trsf$Hour.5
# weekday commute peaks
trsf$P <- trsf$Hour.7 + trsf$Hour.8 + trsf$Hour.17 + trsf$Hour.18
# glmnet expects matrices: response y, predictors x
y <- as.matrix(trsf[, "V1"])
x <- as.matrix(trsf[, setdiff(names(trsf), "V1")])
library(psych)
##
## Attaching package: 'psych'
## The following object is masked from 'package:MLmetrics':
##
## AUC
## The following object is masked from 'package:seasonal':
##
## outlier
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
# Pairwise correlations among the first 15 columns, FDR-adjusted p-values
mycorr = corr.test(trsf[,1:15],adjust = "fdr")
library(corrplot)
## corrplot 0.84 loaded
# Correlation of each of the 14 scaled features against V1 (column 15)
corr.test(trsf[,1:14],trsf[,15],adjust = "fdr")
## Call:corr.test(x = trsf[, 1:14], y = trsf[, 15], adjust = "fdr")
## Correlation matrix
## [,1]
## AreaSqKm 0.05
## IncomeScor -0.06
## LivingEnSc 0.06
## NoEmployee 0.16
## GrenSpace -0.03
## BornUK -0.09
## NotBornUK -0.10
## NoFlats 0.01
## NoHouses -0.08
## NoOwndDwel -0.08
## MedHPrice 0.05
## Capacity 0.11
## Longitude 0.03
## Latitude 0.13
## Sample Size
## [1] 12534
## Probability values adjusted for multiple tests.
## [,1]
## AreaSqKm 0.00
## IncomeScor 0.00
## LivingEnSc 0.00
## NoEmployee 0.00
## GrenSpace 0.00
## BornUK 0.00
## NotBornUK 0.00
## NoFlats 0.29
## NoHouses 0.00
## NoOwndDwel 0.00
## MedHPrice 0.00
## Capacity 0.00
## Longitude 0.00
## Latitude 0.00
##
## To see confidence intervals of the correlations, print with the short=FALSE option
# Visualize the correlation matrix, hierarchically clustered
corrplot(mycorr$r,order = "hclust")
Number of Employees has a good correlation with the number of trips (V1).
# Search over a log-spaced grid for the lambda with the lowest CV error
lambdas_to_try <- 10^seq(-3, 3, length.out = 100)
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 3.0-2
# Tune lambda with 10-fold cross-validation (alpha = 0 -> ridge).
# NOTE(review): no set.seed immediately before cv.glmnet, so the fold
# assignment (and hence lambda.min) is not reproducible.
# standardize = FALSE because the numeric features were pre-scaled; the
# dummy/engineered columns remain unscaled -- presumably intentional.
ridge_cv <- cv.glmnet(x, y, alpha = 0, lambda = lambdas_to_try, standardize = FALSE, nfolds = 10)
# Plot lambda versus the mean squared error
plot(ridge_cv)
lambda_ridge_cv <- ridge_cv$lambda.min
# Refit the ridge model at the optimal lambda
model_ridge <- glmnet(x, y, alpha = 0, lambda = lambda_ridge_cv,standardize = FALSE)
# In-sample (training) prediction
y_hat_ridge <- predict(model_ridge, x)
# R-squared on the training set
rsq_ridge <- cor(y, y_hat_ridge)^2
print(rsq_ridge)
## s0
## [1,] 0.2988686
# Apply the identical encoding and engineered features to the test split
trsf1 <- data.frame(predict(dmy, newdata = bike1Test))
trsf1$HNP <- (trsf1$WeekDay.Sat + trsf1$WeekDay.Sun) * (trsf1$Hour.7 + trsf1$Hour.8)
trsf1$HP <- (trsf1$WeekDay.Sat + trsf1$WeekDay.Sun) *
  (trsf1$Hour.11 + trsf1$Hour.12 + trsf1$Hour.13 + trsf1$Hour.14)
trsf1$NP <- trsf1$Hour.0 + trsf1$Hour.1 + trsf1$Hour.2 + trsf1$Hour.3 +
  trsf1$Hour.4 + trsf1$Hour.5
trsf1$P <- trsf1$Hour.7 + trsf1$Hour.8 + trsf1$Hour.17 + trsf1$Hour.18
y1 <- as.matrix(trsf1[, "V1"])
x1 <- as.matrix(trsf1[, setdiff(names(trsf1), "V1")])
# Prediction on the held-out stations
y1_hat_ridge <-predict(model_ridge, x1)
# R-squared on the test set (note: overwrites the training rsq_ridge)
rsq_ridge <- cor(y1, y1_hat_ridge)^2
print(rsq_ridge)
## s0
## [1,] 0.2673336
# Compare a few actuals against ridge predictions
head(y1)
## [,1]
## [1,] 2.000000
## [2,] 1.666667
## [3,] 1.000000
## [4,] 1.428571
## [5,] 1.666667
## [6,] 1.500000
head(y1_hat_ridge)
## s0
## 1 3.541375
## 2 3.559652
## 3 2.313968
## 4 5.106241
## 5 3.319669
## 6 1.735085
# Ridge coefficients (sparse matrix; intercept excluded)
model_ridge$beta
## 51 x 1 sparse Matrix of class "dgCMatrix"
## s0
## AreaSqKm 0.071735721
## IncomeScor -0.313027263
## LivingEnSc 0.015515343
## NoEmployee 0.149351741
## GrenSpace -0.086053833
## BornUK -0.163430636
## NotBornUK -0.398276136
## NoFlats 0.286676793
## NoHouses 0.188118834
## NoOwndDwel -0.168201685
## MedHPrice -0.164413821
## Capacity 0.209120897
## Longitude 0.077509735
## Latitude 0.314660721
## WeekDay.Mon -0.009858863
## WeekDay.Tue 0.070621382
## WeekDay.Wed -0.117521200
## WeekDay.Thu 0.008417847
## WeekDay.Fri -0.029776990
## WeekDay.Sat 0.065361005
## WeekDay.Sun 0.013208900
## WeekEnd.0 -0.029448943
## WeekEnd.1 0.005965379
## Hour.0 0.046514242
## Hour.1 -0.042270905
## Hour.2 -0.142138979
## Hour.3 -0.114174224
## Hour.4 -0.275590981
## Hour.5 -0.207268627
## Hour.6 -0.528794420
## Hour.7 -0.007959998
## Hour.8 1.576824230
## Hour.9 0.408843040
## Hour.10 -0.036600887
## Hour.11 -0.209933457
## Hour.12 0.006099475
## Hour.13 0.045265874
## Hour.14 -0.025259011
## Hour.15 0.378272257
## Hour.16 0.621496122
## Hour.17 0.153398493
## Hour.18 0.033981591
## Hour.19 0.413072067
## Hour.20 -0.196834562
## Hour.21 -0.456833745
## Hour.22 -0.556728841
## Hour.23 -0.637261984
## HNP -2.922824921
## HP 1.171539151
## NP -0.797240189
## P 1.249754031
Interpretation
Number of trips increase in peak hours and decrease in off peaks. The geographic location for stations and ward center also have high impact on the number of trips.
As expected, the number of trips increases in areas with a high number of employees. The negative coefficient on NotBornUK also suggests that residents not born in the UK are less likely to use the bikes.
The Newly Engineered features have significant impact. Particularly the Weekend non peak hours(HNP).
set.seed(10)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:psych':
##
## outlier
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:seasonal':
##
## outlier
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:gridExtra':
##
## combine
# Regression forest with package defaults (ntree = 500).
# The recycling warning below comes from the package internals and is benign.
bike.rf = randomForest(x,y)
## Warning in rfout$mse/(var(y) * (n - 1)/n): Recycling array of length 1 in vector-array arithmetic is deprecated.
## Use c() or as.vector() instead.
summary(bike.rf)
## Length Class Mode
## call 3 -none- call
## type 1 -none- character
## predicted 12534 -none- numeric
## mse 500 -none- numeric
## rsq 500 -none- numeric
## oob.times 12534 -none- numeric
## importance 51 -none- numeric
## importanceSD 0 -none- NULL
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 11 -none- list
## coefs 0 -none- NULL
## y 12534 -none- numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
# Random-forest predictions on the held-out stations
perf$rf_pred = predict(bike.rf,x1)
# Fix: extract V1 as a plain vector; bike1Test[,"V1"] returns a one-column
# data.table and only worked by accident of column unpacking.
perf$rf_actual = bike1Test$V1
library(Metrics)
library(MLmetrics)
# Negative R-squared: worse than predicting the mean of the actuals
R2_Score(perf$rf_actual, perf$rf_pred)
## [1] -0.09660702
RMSE(perf$rf_actual, perf$rf_pred)
## [1] 1.222165
head(perf$rf_pred)
## [1] 2.868576 3.173746 1.352768 5.315107 2.841447 1.645584
head(perf$rf_actual)
## [1] 2.000000 1.666667 1.000000 1.428571 1.666667 1.500000
# Variable importance (increase in node purity)
bike.rf$importance
## IncNodePurity
## AreaSqKm 1126.78503
## IncomeScor 836.35934
## LivingEnSc 1240.07743
## NoEmployee 3268.23550
## GrenSpace 959.97117
## BornUK 1110.33004
## NotBornUK 2217.09415
## NoFlats 1065.42380
## NoHouses 1496.89526
## NoOwndDwel 1107.23894
## MedHPrice 1214.14197
## Capacity 1879.11748
## Longitude 1139.16170
## Latitude 1815.61659
## WeekDay.Mon 375.74431
## WeekDay.Tue 410.66565
## WeekDay.Wed 357.70574
## WeekDay.Thu 340.27800
## WeekDay.Fri 351.48297
## WeekDay.Sat 409.64912
## WeekDay.Sun 458.50916
## WeekEnd.0 1332.96596
## WeekEnd.1 1315.29196
## Hour.0 47.51124
## Hour.1 40.18374
## Hour.2 58.74811
## Hour.3 23.72540
## Hour.4 22.20557
## Hour.5 34.31158
## Hour.6 453.08315
## Hour.7 1079.16587
## Hour.8 2168.96918
## Hour.9 577.18885
## Hour.10 211.60929
## Hour.11 182.37521
## Hour.12 225.44181
## Hour.13 247.40891
## Hour.14 261.41202
## Hour.15 583.54815
## Hour.16 979.31344
## Hour.17 2108.07515
## Hour.18 753.99875
## Hour.19 404.17161
## Hour.20 181.92696
## Hour.21 293.61683
## Hour.22 296.52897
## Hour.23 337.18083
## HNP 850.68423
## HP 984.41358
## NP 1473.87563
## P 3049.38250
Hours and weekdays are the most informative attributes for reducing impurity. The number of employees also has a significant impact on the average number of trips per hour per station.
Ridge improves on the linear model, reaching R² = 0.27 on the test set. Random Forest achieves a lower RMSE of 1.22 but a negative R² (−0.09), meaning it explains less variance than simply predicting the mean — so despite the RMSE, its test performance on unseen stations is weak.